library(kernlab)
library(KernSmooth)
library(MASS)
library("ks")
library(foreach)
library(glmnet)
library(randomForest)
library(doParallel)
require(tidyverse)
require(ggplot2)
library(ggpubr)
library(caret)
library(nnet)
#library(ROSE)
#library(ranger)
setwd(dirname(rstudioapi::getSourceEditorContext()$path))
source("functions_1210.R")
source("algoclass_1211.R")
# Load the abalone dataset; "?" marks missing values, which are dropped.
data0<-read.table("abalone.txt",header=T,sep = ",")
data0[data0=="?"]=NA
data1<-na.omit(data0)
# Inspect column names (original comment was mojibake; translated).
colnames(data1)
# One-hot encode the categorical predictors and merge with the numeric response.
colname=colnames(data1)
y=as.numeric(data1$X15)
dummy <- dummyVars(" ~ .", data=data1[,-length(colname)])
#perform one-hot encoding on data frame
data <- data.frame(predict(dummy, newdata=data1))
data$y=y
Number=dim(data)[1]
d=dim(data)[2]-1
## Experiment size parameters (original comment was mojibake).
n=2000
N=2000
algo=new("NN")
g = 'exp'
n_train=round(n*0.5)# training split size (mojibake comment; presumably "training set sample size" — confirm)
n_cal=round(n*0.5)# calibration split size (mojibake comment — confirm)
nseq=cumsum(c(n,N))
lambda=500
# Selective conformal p-value procedure: both calibration and test points are
# restricted to the selected region T > L, nulls are those with Y <= b_0, and
# FDR control is delegated to ResultCompute (defined later in this file).
SCPV <- function(Y_cal, Y_test, b_0, T_cal, T_test, L, V_cal, V_test, alpha) {
  cal_in_region <- T_cal > L
  test_in_region <- T_test > L
  ResultCompute(
    which(cal_in_region & (Y_cal <= b_0)),   # calibration nulls in region
    which(cal_in_region),                    # all selected calibration points
    which(test_in_region & (Y_test <= b_0)), # true nulls among selected test
    which(test_in_region),                   # selected test points
    which(test_in_region & (Y_test > b_0)),  # true alternatives, selected
    V_cal, V_test, alpha
  )
}
# Multiple-testing baseline: uses the FULL calibration sample (no selection on
# T_cal) while test nulls/alternatives are still restricted to T > L.
# Optional Bonferroni / BY-style corrections shrink the nominal level that is
# handed to ResultCompute (defined later in this file).
OMT <- function(Y_cal, Y_test, b_0, T_cal, T_test, L, V_cal, V_test, alpha,
                Bonfer = FALSE, BY = FALSE) {
  Null_cal <- which(Y_cal <= b_0)
  Null_test <- which((Y_test <= b_0) & (T_test > L))
  Al_test <- which((Y_test > b_0) & (T_test > L))
  # Always-true condition: selects every calibration index (kept verbatim to
  # preserve behavior if Y_cal ever contained NA).
  Select_cal <- which(Y_cal <= max(Y_cal))
  Select_test <- which(T_test > L)
  m_sel <- length(Select_test)  # number of selected test points
  # Pick the working significance level.
  if (Bonfer) {
    if (BY) {
      level <- alpha * m_sel / (length(V_test) * log(m_sel))
    } else {
      level <- alpha * m_sel / length(V_test)
    }
  } else {
    level <- alpha
  }
  ResultCompute(Null_cal, Select_cal, Null_test, Select_test, Al_test,
                V_cal, V_test, level)
}
# SCOP baseline: conformal p-values compare shifted test predictions
# V_test = b_0 - Y_test_hat against calibration residuals
# V_cal = Y_cal - Y_cal_hat; each p-value is rejected at level alpha directly
# (no BH step). Returns list(FDP, Power).
#
# Bug fix: the original body read `Y_test` from the enclosing environment even
# though it was not an argument. It is now a trailing parameter whose default
# looks it up in the caller's frame, reproducing the old behavior for existing
# call sites while allowing explicit passing.
SCOP<-function(Y_cal,Y_cal_hat,Y_test_hat,b_0,T_test,T_cal,L,alpha,
               Y_test=get("Y_test",envir=parent.frame())){
  # Conformity scores.
  V_cal=Y_cal-Y_cal_hat
  V_test=b_0-Y_test_hat
  # Ground-truth labels among the selected test points (T > L).
  Null_test=which((Y_test<=b_0)&(T_test>L))
  Al_test=which((Y_test>b_0)&(T_test>L))
  Select_test=which(T_test>L)
  # NOTE: the full calibration score set is used as the reference (no
  # restriction to calibration nulls) — kept as in the original.
  V_cal_Null=V_cal
  # Conformal p-value per test point (strict inequality, +1 smoothing).
  p<-c()
  for (i in seq_along(V_test)) {
    p[i]<-(length(V_cal_Null[V_cal_Null<V_test[i]])+1)/(length(V_cal_Null)+1)
  }
  TrueNull_test=p[Null_test]
  Reject_test=p[Select_test]
  # FDP with a +1-stabilized denominator; Power is the rejected fraction of
  # true alternatives.
  FDP=length(TrueNull_test[TrueNull_test<alpha])/(length(Reject_test[Reject_test<alpha])+1)
  TrueAl_test=p[Al_test]
  Power=length(TrueAl_test[TrueAl_test<alpha])/length(Al_test)
  return(list(FDP=FDP,Power=Power))
}
ns=100# number of replicate experiments (original comment was mojibake)
# Core FDR machinery shared by SCPV/OMT: builds conformal p-values from the
# calibration null scores, rescales by the null fraction among selected
# calibration points, then applies BH on the selected test points.
# Returns list(FDP, Power) evaluated against the true labels.
ResultCompute<-function(Null_cal,Select_cal,Null_test,Select_test,Al_test,V_cal,V_test,alpha){
V_cal_Null=V_cal[Null_cal]
# Conformal p-value per test point: (#{cal null scores <= score} + 1)/(n0 + 1).
p<-c()
for (i in 1:length(V_test)) {
p[i]<-(length(V_cal_Null[V_cal_Null<=V_test[i]])+1)/(length(V_cal_Null)+1)
}
# Null-proportion correction: scale by |null cal| / |selected cal|.
p<-p*length(Null_cal)/length(Select_cal)
# BH adjustment on the selected test p-values; k_hat = number of rejections.
p_adj=p.adjust(p[Select_test],method = "BH",n=length(p[Select_test]))
k_hat=length(p_adj[p_adj<alpha])
TrueNull_test=p[Null_test]
# FDP counts true nulls under the BH cutoff k_hat*alpha/m. NOTE(review): the
# denominator is k_hat+1 rather than max(k_hat,1), which slightly deflates
# FDP when k_hat > 0 — presumably intentional smoothing; confirm.
FDP=length(TrueNull_test[TrueNull_test<(k_hat*alpha/length(p_adj))])/(k_hat+1)
TrueAl_test=p[Al_test]
Power=length(TrueAl_test[TrueAl_test<(k_hat*alpha/length(p_adj))])/length(Al_test)
return(list(FDP=FDP,Power=Power))
}
# Single interactive run: hyper-parameters, data split, model fit.
alpha=0.1
Thresholding_value=0.7
Thresholding_type_array=c("test","exchange","mean")
sr=0.8
info<-data.frame()
# DataSplit comes from the sourced functions file; argument meaning inferred
# from usage (total, test size, calibration size, pool size) — confirm there.
datawork=DataSplit(data,n+N,N,n_cal,n)
data_train=datawork$data_train
data_cal=datawork$data_cal
data_rest=datawork$data_rest
data_test=datawork$data_test
# Drop the response column (position d+1) to form the design matrices.
X_train=data_train[colnames(data_train)[-d-1]]
Y_train=as.matrix(data_train$y)
X_cal=data_cal[colnames(data_cal)[-d-1]]
Y_cal=as.matrix(data_cal$y)
X_rest=data_rest[colnames(data_rest)[-d-1]]
Y_rest=as.matrix(data_rest$y)
X_test=data_test[colnames(data_test)[-d-1]]
Y_test=as.matrix(data_test$y)
# Random forest regression of y on the one-hot features.
model=randomForest(y~.,data=data.frame(X_train,y=Y_train),ntree=500)
#model=nnet(y~., data = data.frame(X_train,y=as.factor(Y_train)), size = 20,decay=5e-4, maxit = 2000)
Y_cal_hat=predict(model,X_cal)
Y_test_hat=predict(model,X_test)
#Y_cal_hat=as.numeric(predict(model,X_cal,type="raw"))
#Y_test_hat=as.numeric(predict(model,X_test,type="raw"))
#T_test=X_test[,"Age"]
#T_cal=X_cal[,"Age"]
# Selection covariate T: column 10 of the feature matrix.
T_test=X_test[,10]
T_cal=X_cal[,10]
quantile(Y_test,0.7)
X_test
# NOTE(review): this section duplicates the setup above almost verbatim —
# this file appears to be pasted console history. Differences: `nfo` below is
# unused (probably a typo for `info`), and the final quantile level is 0.8.
library(kernlab)
library(KernSmooth)
library(MASS)
library("ks")
library(foreach)
library(glmnet)
library(randomForest)
library(doParallel)
require(tidyverse)
require(ggplot2)
library(ggpubr)
library(caret)
library(nnet)
#library(ROSE)
#library(ranger)
setwd(dirname(rstudioapi::getSourceEditorContext()$path))
source("functions_1210.R")
source("algoclass_1211.R")
data0<-read.table("abalone.txt",header=T,sep = ",")
data0[data0=="?"]=NA
data1<-na.omit(data0)
# Inspect column names (original comment was mojibake; translated).
colnames(data1)
# One-hot encode categorical predictors and merge with the numeric response.
colname=colnames(data1)
y=as.numeric(data1$X15)
dummy <- dummyVars(" ~ .", data=data1[,-length(colname)])
#perform one-hot encoding on data frame
data <- data.frame(predict(dummy, newdata=data1))
data$y=y
Number=dim(data)[1]
d=dim(data)[2]-1
## Experiment size parameters (original comment was mojibake).
n=2000
N=2000
algo=new("NN")
g = 'exp'
n_train=round(n*0.5)# training split size (mojibake comment — confirm)
n_cal=round(n*0.5)# calibration split size (mojibake comment — confirm)
nseq=cumsum(c(n,N))
lambda=500
nfo<-data.frame()
datawork=DataSplit(data,n+N,N,n_cal,n)
data_train=datawork$data_train
data_cal=datawork$data_cal
data_rest=datawork$data_rest
data_test=datawork$data_test
X_train=data_train[colnames(data_train)[-d-1]]
Y_train=as.matrix(data_train$y)
X_cal=data_cal[colnames(data_cal)[-d-1]]
Y_cal=as.matrix(data_cal$y)
X_rest=data_rest[colnames(data_rest)[-d-1]]
Y_rest=as.matrix(data_rest$y)
X_test=data_test[colnames(data_test)[-d-1]]
Y_test=as.matrix(data_test$y)
model=randomForest(y~.,data=data.frame(X_train,y=Y_train),ntree=500)
#model=nnet(y~., data = data.frame(X_train,y=as.factor(Y_train)), size = 20,decay=5e-4, maxit = 2000)
Y_cal_hat=predict(model,X_cal)
Y_test_hat=predict(model,X_test)
#Y_cal_hat=as.numeric(predict(model,X_cal,type="raw"))
#Y_test_hat=as.numeric(predict(model,X_test,type="raw"))
#T_test=X_test[,"Age"]
#T_cal=X_cal[,"Age"]
T_test=X_test[,10]
T_cal=X_cal[,10]
quantile(Y_test,0.8)
# NOTE(review): another pasted-history segment. It sources different files
# ("Functions.R"/"AlgorithmClass.R") than the sections above, and briefly
# switches to an HR "train.csv" dataset before the abalone setup resumes.
library(kernlab)
library(KernSmooth)
library(MASS)
library("ks")
library(foreach)
library(glmnet)
library(randomForest)
library(doParallel)
require(tidyverse)
require(ggplot2)
library(ggpubr)
library(caret)
library(nnet)
#library(ROSE)
#library(ranger)
setwd(dirname(rstudioapi::getSourceEditorContext()$path))
source("Functions.R")
source("AlgorithmClass.R")
data0<-read.table("abalone.txt",header=T,sep = ",")
data0[data0=="?"]=NA
data1<-na.omit(data0)
colnames(data1)
View(data0)
y=as.numeric(data1$X15)
colname=colnames(data1)
colname
dummyVars(" ~ .", data=data1[,-length(colname)])
dummy <- dummyVars(" ~ .", data=data1[,-length(colname)])
dummy
data.frame(predict(dummy, newdata=data1))
##read and process the data
data0<-read.table("train.csv",header=T,sep = ",")
# Keep only the first comma-separated token of the education field.
data0$education=sapply(data0$education,function(t){return(strsplit(t,',')[[1]][1])})
colnames(data0)[13]<-"y"
data0=na.omit(data0)
# Standardize continuous predictors; keep categorical ones as factors.
continvar=subset(data0,select=-c(region,department,education,gender,recruitment_channel,y,employee_id))
continvar=as.data.frame(scale(continvar,center = TRUE,scale = TRUE))
typevar=subset(data0,select=c(department,education,gender,recruitment_channel))
typevar=lapply(typevar, function(x){factor(x)})
data=cbind(continvar,typevar,y=data0$y)
Number=dim(data)[1]
sr=1-sum(data$y==1)/Number
d=dim(data)[2]-1
n=2000
N=2000
n_train=round(n*0.5)
n_cal=round(n*0.5)
ns=100
alpha=0.1
Thresholding_value=0.7
Thresholding_type_array=c("test","exchange","mean")
sr=0.8
# Null_cal=which((data_cal$y==1)&(data_cal$fit1.cluster==2))
# Null_test=which((data_test$y==1)&(data_test$fit1.cluster==2))
##read and process the data
# Final abalone setup actually used by the parallel experiment below.
data0<-read.table("abalone.txt",header=T,sep = ",")
data0[data0=="?"]=NA
data1<-na.omit(data0)
colnames(data1)
colname=colnames(data1)
y=as.numeric(data1$X15)
dummy <- dummyVars(" ~ .", data=data1[,-length(colname)])
data <- data.frame(predict(dummy, newdata=data1))
data$y=y
Number=dim(data)[1]
d=dim(data)[2]-1
n=2000
N=2000
n_train=round(n*0.5)
n_cal=round(n*0.5)
ns=100#replication times
alpha=0.1
Thresholding_value=0.7
Thresholding_type_array=c("test","exchange","mean")
sr=0.8
# Main Monte-Carlo experiment: ns replications across 10 parallel workers.
# Each replication re-splits the data, refits the random forest, and evaluates
# OMT / Bonferroni / BY / SCPV under three thresholding rules for L.
cl = makeCluster(10)
registerDoParallel(cl)
result<-foreach(iter=1:ns,.combine="rbind",.packages = c('MASS',"randomForest","kernlab","nnet"))%dopar% {
info<-data.frame()
# Fresh random split per replication (DataSplit from the sourced files).
datawork=DataSplit(data,n+N,N,n_cal,n)
data_train=datawork$data_train
data_cal=datawork$data_cal
data_rest=datawork$data_rest
data_test=datawork$data_test
# Drop the response column (position d+1) to form design matrices.
X_train=data_train[colnames(data_train)[-d-1]]
Y_train=as.matrix(data_train$y)
X_cal=data_cal[colnames(data_cal)[-d-1]]
Y_cal=as.matrix(data_cal$y)
X_rest=data_rest[colnames(data_rest)[-d-1]]
Y_rest=as.matrix(data_rest$y)
X_test=data_test[colnames(data_test)[-d-1]]
Y_test=as.matrix(data_test$y)
model=randomForest(y~.,data=data.frame(X_train,y=Y_train),ntree=500)
Y_cal_hat=predict(model,X_cal)
Y_test_hat=predict(model,X_test)
# Selection covariate: column 10 of the feature matrix.
T_test=X_test[,10]
T_cal=X_cal[,10]
# Null cutoff for the response (rings <= 12 is "null").
b_0=12
# NOTE(review): the "constant" branch is unreachable — it is not in
# Thresholding_type_array above.
for (Thresholding_type in Thresholding_type_array) {
if(Thresholding_type=="constant")
{L=0.30}else if(Thresholding_type=="test")
{L=quantile(T_test,Thresholding_value)}else if(Thresholding_type=="exchange")
{L=quantile(c(T_test,T_cal),Thresholding_value)}else if(Thresholding_type=="mean")
{L=mean(T_test)}
# Negated predictions serve as conformity scores.
V_cal=-Y_cal_hat
V_test=-Y_test_hat
Result_OMT=OMT(Y_cal,Y_test,b_0,T_cal,T_test,L,V_cal,V_test,alpha)
info=rbind(info,list(FDP=Result_OMT$FDP,Power=Result_OMT$Power,Method="OMT",Thresholding_type=Thresholding_type))
#Bonferroni
Result_Bonfer=OMT(Y_cal,Y_test,b_0,T_cal,T_test,L,V_cal,V_test,alpha,Bonfer = TRUE,BY=FALSE)
info=rbind(info,list(FDP=Result_Bonfer$FDP,Power=Result_Bonfer$Power,Method="Bonferroni",Thresholding_type=Thresholding_type))
#BY
Result_BY=OMT(Y_cal,Y_test,b_0,T_cal,T_test,L,V_cal,V_test,alpha,Bonfer = TRUE,BY=TRUE)
info=rbind(info,list(FDP=Result_BY$FDP,Power=Result_BY$Power,Method="BY",Thresholding_type=Thresholding_type))
#SCPV
# MeanSCPV (from the sourced files) handles the data-dependent mean threshold.
if(Thresholding_type=="mean"){
Result_SCPV=MeanSCPV(Y_cal,Y_test,b_0,T_cal,T_test,L,V_cal,V_test,alpha)
}else{
Result_SCPV=SCPV(Y_cal,Y_test,b_0,T_cal,T_test,L,V_cal,V_test,alpha)
}
info=rbind(info,list(FDP=Result_SCPV$FDP,Power=Result_SCPV$Power,Method="SCPV",Thresholding_type=Thresholding_type))
}
return(info)
}
stopCluster(cl)
# Aggregate over replications: FDR/power means and sds per method and
# thresholding rule. NOTE: inside summarize, Power_sd=sd(Power) is computed
# BEFORE Power=mean(Power) overwrites the column — the order is load-bearing.
tab=result%>%
group_by(Method,Thresholding_type)%>%
dplyr::summarize(FDR=mean(FDP),FDR_sd=sd(FDP),Power_sd=sd(Power),Power=mean(Power))
tab
